In [1]:

    
import numpy
import scipy
import pandas
import spacy
import textacy



In [2]:

    
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
# Global plot styling: seaborn "whitegrid" theme with seaborn colour codes enabled.
sns.set(style="whitegrid", color_codes=True)



In [3]:

    
# Load the previously processed, gzip-compressed textacy corpus of filed bills.
# NOTE(review): the path is absolute and machine-specific; anyone re-running this
# notebook elsewhere has to edit it (consider a configurable data directory).
corpus = textacy.Corpus.load(path='/home/immersinn/gits/ncga/data/processed/CORPUS_bills_filed_pipe01/',
                           name='CORPUS_bills_filed_pipe01',
                           compression='gzip')

Tokenize and Vectorize Corpus



In [4]:

    
# One lazily generated list of terms (unigrams plus named entities, as strings)
# per document in the corpus.
terms_lists = (
    doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
    for doc in corpus
)

# Vectorize into a normalized TF-IDF document-term matrix, dropping terms found in
# fewer than 3 documents or in more than 95% of them, capped at 1000 terms.
doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
    terms_lists,
    weighting='tfidf',
    normalize=True,
    smooth_idf=True,
    min_df=3,
    max_df=0.95,
    max_n_terms=1000,
)
doc_term_matrix









    Out[4]:





<2098x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 247753 stored elements in Compressed Sparse Row format>

Topic Model



In [7]:

    
# Number of topics for this first, exploratory model.
n_topics = 4
# Fit an 'lda' topic model on the TF-IDF document-term matrix built above.
model = textacy.tm.TopicModel('lda', n_topics=n_topics)
model.fit(doc_term_matrix)

Transform the corpus and Interpret the Model:



In [8]:

    
# Project every document onto the fitted topics (one row per document, one column per topic).
doc_topic_matrix = model.transform(doc_term_matrix)



In [9]:

    
# Sanity check: expect (number of documents, n_topics).
doc_topic_matrix.shape









    Out[9]:





(2098, 4)



In [10]:

    
# Topic weights of the first 10 documents; each row sums to unity,
# i.e. a row is that document's distribution over the topics.
doc_topic_matrix[:10,:]









    Out[10]:





array([[ 0.04006075,  0.88406709,  0.03798571,  0.03788646],
       [ 0.07234805,  0.78724429,  0.07004595,  0.07036171],
       [ 0.01829032,  0.94812449,  0.01683505,  0.01675014],
       [ 0.06037106,  0.83623807,  0.05153815,  0.05185272],
       [ 0.04981398,  0.81500027,  0.0878845 ,  0.04730126],
       [ 0.03141068,  0.91373558,  0.02723571,  0.02761803],
       [ 0.24806708,  0.66059208,  0.04549823,  0.04584261],
       [ 0.02670665,  0.92355787,  0.02480258,  0.0249329 ],
       [ 0.02614035,  0.88732876,  0.06103577,  0.02549512],
       [ 0.57191424,  0.36283075,  0.03251098,  0.03274404]])

View Top Terms, Top Keywords from Top Documents



In [11]:

    
# Pool every document-topic weight into one flat series and summarize its
# distribution, with extra resolution in the upper tail.
topic_weights = pandas.Series(doc_topic_matrix.ravel())
topic_weights.describe(percentiles=[0.75, 0.80, 0.90, 0.95, 0.975, 0.99])









    Out[11]:





count    8392.000000
mean        0.250000
std         0.329383
min         0.015022
50%         0.047603
75%         0.450906
80%         0.670733
90%         0.870961
95%         0.892481
97.5%       0.909019
99%         0.920180
max         0.948124
dtype: float64



In [12]:

    
# Per topic: how many documents carry a weight above 0.1 for that topic.
(doc_topic_matrix > 0.1).sum(axis=0)









    Out[12]:





array([ 748, 1706,  102,  250])



In [13]:

    
# Collect the 15 highest-weighted terms of every topic, then lay them out as a
# table with one row per topic and one column per term rank.
topic_terms = list(model.top_topic_terms(id2term, top_n=15, topics=range(n_topics)))
index = ['topic ' + str(topic_idx) for topic_idx, _ in topic_terms]
top_term_table = pandas.DataFrame(
    data=[dict(enumerate(top_terms)) for _, top_terms in topic_terms],
    index=index,
)



In [14]:

    
# Transpose for display: topics become columns, term ranks become rows.
top_term_table.transpose()









    Out[14]:






  
    
      
      topic 0
      topic 1
      topic 2
      topic 3
    
  
  
    
      0
      school
      shall
      plate
      tax
    
    
      1
      fund
      state
      animal
      taxpayer
    
    
      2
      student
      person
      foot
      income
    
    
      3
      education
      district
      registration
      taxable
    
    
      4
      program
      board
      town
      sale
    
    
      5
      shall
      member
      veteran
      credit
    
    
      6
      health
      commission
      south
      property
    
    
      7
      year
      law
      special
      levy
    
    
      8
      board
      county
      page
      food
    
    
      9
      state
      senate
      describe
      investment
    
    
      10
      department
      's
      tract
      gross
    
    
      11
      fiscal
      service
      point
      year
    
    
      12
      teacher
      article
      corporate
      business
    
    
      13
      child
      provide
      map
      refund
    
    
      14
      local
      resolution
      book
      fuel

Density-based Method for Adaptive LDA Model Selection



In [16]:

    
# Topic-term weight matrix of the wrapped model: expect (n_topics, number of terms).
model.model.components_.shape









    Out[16]:





(4, 1000)



In [17]:

    
# Weights of the first four terms for every topic. These are raw weights, not
# probabilities (values exceed 1), so they must be normalized before use as p(w|t).
model.model.components_[:,:4]









    Out[17]:





array([[  0.25977875,   0.25774284,   0.47202631,   4.83220664],
       [  5.0444299 ,  16.20158631,   6.13205057,  11.19989377],
       [  0.25151976,   0.26379353,   1.25476955,   0.26803282],
       [  0.25731468,   0.27131475,   0.74280387,   1.68892231]])



In [118]:

    
# Calculate correlations between Topics in a given Model via the Topic - Term Weighting Vectors
def calc_p_topic_given_word(model, doc_topic_matrix, n_topics, term_matrix=None):
    """Estimate p(topic | word) for a fitted topic model via Bayes' rule.

    Parameters
    ----------
    model : object exposing ``model.components_``
        Fitted topic model wrapper; ``model.model.components_`` is the
        (n_topics, n_terms) topic-term weight matrix.
    doc_topic_matrix : numpy.ndarray, shape (n_docs, n_topics)
        Topic weights per document (output of ``model.transform``).
    n_topics : int
        Number of topics in ``model``.
    term_matrix : dense array or scipy sparse matrix, shape (n_docs, n_terms), optional
        Document-term matrix used to estimate p(word). Defaults to the
        notebook-level ``doc_term_matrix`` so existing three-argument calls
        keep working.

    Returns
    -------
    numpy.ndarray, shape (n_topics, n_terms)
        Column ``j`` holds the estimated distribution over topics for term ``j``;
        every column sums to 1.
    """
    if term_matrix is None:
        # Backward-compatible default: fall back to the notebook's matrix.
        term_matrix = doc_term_matrix

    # 01: Convert model components to a distribution --> p(w|t)
    components = numpy.asarray(model.model.components_)
    p_word_given_topic = components / components.sum(axis=1).reshape((n_topics, 1))

    # 02: Calculate estimate of p(t) from the documents actually passed in.
    # The document count comes from the matrix itself rather than from a global
    # corpus object, so the function works for any (model, matrix) pair.
    n_docs = doc_topic_matrix.shape[0]
    p_topics = (numpy.asarray(doc_topic_matrix).sum(axis=0) / n_docs).reshape((n_topics, 1))
    p_topics = p_topics + (1 - p_topics.sum()) / n_topics  # Comp round error correct

    # 03: Estimate p(w) from the term weights. Summing a scipy sparse matrix
    # yields a numpy.matrix; flatten it to a plain 1-D array so the arithmetic
    # below is ordinary element-wise broadcasting and the result is an ndarray.
    term_totals = numpy.asarray(term_matrix.sum(axis=0)).ravel()
    p_words = term_totals / term_totals.sum()
    p_words = p_words + (1 - p_words.sum()) / p_words.shape[0]  # Comp round error correct

    # 04: p(t|w) = p(w|t) * p(t) / p(w)
    p_topic_given_word = p_word_given_topic * p_topics / p_words
    # Spread each column's deviation from 1 evenly over the topics so that every
    # column sums to exactly 1.
    # NOTE(review): p(w) is estimated from the term matrix, not from the model's
    # own marginal sum_t p(w|t) p(t), so this shift can be larger than rounding
    # error -- confirm that an additive correction is what is intended.
    p_topic_given_word = p_topic_given_word + (1 - p_topic_given_word.sum(axis=0)) / n_topics

    return p_topic_given_word


def topic_cosdists(model, doc_topic_matrix, n_topics):
    """Return the pairwise cosine distances between the topics of a model.

    Each topic is represented by its row of p(topic | word) values as computed
    by ``calc_p_topic_given_word``.

    Parameters
    ----------
    model : fitted topic model wrapper (passed through unchanged)
    doc_topic_matrix : document-topic weights for ``model`` (passed through unchanged)
    n_topics : int
        Number of topics in ``model``.

    Returns
    -------
    numpy.ndarray, shape (n_topics, n_topics)
        Symmetric matrix of cosine distances with a zero diagonal.
    """
    # Import the submodule explicitly: on older SciPy releases a bare
    # `import scipy` does not load `scipy.spatial`, so the attribute lookup only
    # worked when some other library had already imported it.
    from scipy.spatial.distance import pdist, squareform

    ptw = calc_p_topic_given_word(model, doc_topic_matrix, n_topics)
    # pdist returns the condensed (upper-triangle) form; expand it to a square matrix.
    return squareform(pdist(numpy.asarray(ptw), 'cosine'))


def average_topic_distances(topic_distances, n_topics):
    """Average the pairwise distances of a square topic-distance matrix.

    Only the strictly lower triangle is used, so each unordered topic pair is
    counted once and the diagonal is ignored.

    Parameters
    ----------
    topic_distances : array-like, shape (n_topics, n_topics)
        Square matrix of pairwise topic distances.
    n_topics : int
        Number of topics; must match both dimensions of ``topic_distances``.

    Returns
    -------
    float
        Sum of the pairwise distances divided by n_topics * (n_topics - 1) / 2.

    Raises
    ------
    ValueError
        If ``topic_distances`` is not of shape (n_topics, n_topics), or if there
        are fewer than two topics (no pairs to average over).
    """
    topic_distances = numpy.asarray(topic_distances)
    if topic_distances.shape != (n_topics, n_topics):
        raise ValueError(
            'topic_distances must have shape ({0}, {0}); got {1}'.format(
                n_topics, topic_distances.shape))
    if n_topics < 2:
        raise ValueError('at least two topics are required to average pairwise distances')

    # Index with the argument's own lower triangle (k=-1 excludes the diagonal),
    # not with some matrix that happens to exist at notebook level.
    pairwise = topic_distances[numpy.tril_indices_from(topic_distances, -1)]
    return pairwise.sum() / (n_topics * (n_topics - 1) / 2)



In [109]:

    
# p(topic | word) for the current model: one row per topic, one column per term.
ptw = calc_p_topic_given_word(model, doc_topic_matrix, n_topics)



In [113]:

    
# Pairwise cosine distances between the current model's topics
# (square, symmetric, zero on the diagonal).
corre_topics = topic_cosdists(model, doc_topic_matrix, n_topics)
corre_topics









    Out[113]:





array([[ 0.        ,  0.53400191,  0.79761223,  0.81960128],
       [ 0.53400191,  0.        ,  0.62390567,  0.70115904],
       [ 0.79761223,  0.62390567,  0.        ,  0.95051313],
       [ 0.81960128,  0.70115904,  0.95051313,  0.        ]])



In [115]:

    
# Collapse the distance matrix into a single score: the average distance over topic pairs.
ave_dis = average_topic_distances(corre_topics, n_topics)
ave_dis









    Out[115]:





0.73779887723896087

Test Multiple Models



In [141]:

    
# Results of the sweep: fitted models and document-topic matrices keyed by topic
# count, plus one summary record per model.
models, models_dtms, models_ave_dis = {}, {}, []

# NOTE: the loop rebinds the notebook-level names `n_topics`, `model`,
# `doc_topic_matrix`, `corre_topics` and `ave_dis`; after this cell they refer
# to the last (largest) model, not the 4-topic model explored earlier.
for n_topics in range(3, 31, 3):
    print('Fitting model for {} topics...'.format(n_topics))

    # Train an LDA model with this topic count and project the documents onto it.
    model = textacy.tm.TopicModel('lda', n_topics=n_topics)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)

    # Score the model by the average pairwise cosine distance between its topics.
    corre_topics = topic_cosdists(model, doc_topic_matrix, n_topics)
    ave_dis = average_topic_distances(corre_topics, n_topics)

    # Keep everything for later inspection.
    models[n_topics] = model
    models_dtms[n_topics] = doc_topic_matrix
    models_ave_dis.append(dict(n_topics=n_topics, ave_dis=ave_dis))









    



Fitting model for 3 topics...
Fitting model for 6 topics...
Fitting model for 9 topics...
Fitting model for 12 topics...
Fitting model for 15 topics...
Fitting model for 18 topics...
Fitting model for 21 topics...
Fitting model for 24 topics...
Fitting model for 27 topics...
Fitting model for 30 topics...



In [151]:

    
# Turn the per-model records into a table ordered by topic count.
# The column is renamed by name rather than by assigning a positional list to
# `.columns`: positional assignment only works if the frame's columns happen to
# come out as ('ave_dis', 'n_topics'), which depends on how the pandas version
# orders dict keys; with insertion-ordered keys it would swap the two labels.
# Chaining (no `inplace`) also keeps the cell safe to re-run.
models_ave_dis = (
    pandas.DataFrame(models_ave_dis)
    .rename(columns={'ave_dis': 'avg_dist'})
    .sort_values(by='n_topics')
    .loc[:, ['avg_dist', 'n_topics']]
)
models_ave_dis



In [155]:

    
# Average inter-topic distance as a function of the number of topics.
# Axes are labeled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(models_ave_dis.n_topics, models_ave_dis.avg_dist, marker='o')
ax.set_xlabel('Number of topics')
ax.set_ylabel('Average pairwise cosine distance between topics')
ax.set_title('Topic separation vs. number of topics')
plt.show()









    Out[155]:





[<matplotlib.lines.Line2D at 0x7f799a8dd320>]

Perplexity



In [169]:

    
from IPython.display import Image

Do we need to build models with holdout data / cross-validation folds / etc. in order to compute this metric accurately (see paper and below)?



In [170]:

    
# Show the perplexity definition from an image file stored relative to the notebook.
Image(filename='images/perplexity_def.png')









    Out[170]:

Below we use sklearn's perplexity measure (see here and here)



In [162]:

    
# One record per fitted model: its topic count and its perplexity on the same
# document-term matrix the model was trained on.
perplexity = [
    {'n_topics': topic_count, 'perp': fitted.model.perplexity(doc_term_matrix)}
    for topic_count, fitted in models.items()
]



In [163]:

    
# Tabulate the records and order them by topic count.
perplexity = pandas.DataFrame(perplexity).sort_values(by='n_topics')



In [168]:

    
# Perplexity as a function of the number of topics.
# The values span many orders of magnitude, so a log-scaled y-axis keeps the
# small-topic-count end of the curve readable; axes are labeled so the figure
# can be read on its own.
fig, ax = plt.subplots()
ax.plot(perplexity.n_topics, perplexity.perp, marker='o')
ax.set_yscale('log')
ax.set_xlabel('Number of topics')
ax.set_ylabel('Perplexity (log scale)')
ax.set_title('Perplexity vs. number of topics')
plt.show()









    Out[168]:





[<matplotlib.lines.Line2D at 0x7f7992000048>]



In [165]:

    
# Show the perplexity table (rows ordered by n_topics).
perplexity









    Out[165]:






  
    
      
      n_topics
      perp
    
  
  
    
      1
      3
      2.996926e+03
    
    
      3
      6
      3.401317e+04
    
    
      5
      9
      3.205789e+05
    
    
      7
      12
      2.534031e+06
    
    
      9
      15
      2.160770e+07
    
    
      0
      18
      1.793491e+08
    
    
      2
      21
      1.459009e+09
    
    
      4
      24
      1.214828e+10
    
    
      6
      27
      9.094428e+10
    
    
      8
      30
      7.133521e+11



In [ ]:

	avg_dist	n_topics
0	0.670684	3
1	0.645993	6
2	0.580565	9
3	0.787569	12
4	0.559266	15
5	0.674486	18
6	0.486673	21
7	0.492799	24
8	0.513956	27
9	0.486868	30

	topic 0	topic 1	topic 2	topic 3
0	school	shall	plate	tax
1	fund	state	animal	taxpayer
2	student	person	foot	income
3	education	district	registration	taxable
4	program	board	town	sale
5	shall	member	veteran	credit
6	health	commission	south	property
7	year	law	special	levy
8	board	county	page	food
9	state	senate	describe	investment
10	department	's	tract	gross
11	fiscal	service	point	year
12	teacher	article	corporate	business
13	child	provide	map	refund
14	local	resolution	book	fuel

	n_topics	perp
1	3	2.996926e+03
3	6	3.401317e+04
5	9	3.205789e+05
7	12	2.534031e+06
9	15	2.160770e+07
0	18	1.793491e+08
2	21	1.459009e+09
4	24	1.214828e+10
6	27	9.094428e+10
8	30	7.133521e+11